library(Seurat)
library(ggplot2)
library(grid)
library("GSEABase")
library(stringr)
library(dplyr)
library(Matrix)
library(parallel)
library(data.table)
library(SiPSiC)

# Thresholds and constants read by filterData().
# minimalClusterSize: percentage (0-100) of the cells that constitutes the
# minimal expected cluster size; used as the gene filtering threshold.
minimalClusterSize <- 10
# logScalingConstant: constant subtracted after exponentiating log-scaled input.
logScalingConstant <- 1
# minNumOfGenesExpressed: minimal number of expressed (non-zero) genes a cell
# must have in order to be kept.
minNumOfGenesExpressed <- 1000

# Filters a genes x cells expression table and optionally converts it to CPM.
#
# Steps, in order:
#   1. If isLogTPM is TRUE the values are transformed back with 2^x - logOffset.
#   2. Cells (columns) with fewer than minGenesPerCell non-zero genes are removed.
#   3. Genes (rows) are kept only if they are non-zero in MORE than
#      minClusterPercent percent of the remaining cells.
#   4. If convertToCPM is TRUE every remaining cell is rescaled so that its
#      values sum to one million.
#
# Parameters:
#   dataMatrix        - matrix or data frame, genes in rows and cells in columns.
#   isLogTPM          - TRUE if the input is log2-scaled and must be transformed back.
#   convertToCPM      - TRUE to rescale every cell to counts per million.
#   minGenesPerCell   - minimal number of expressed genes per cell
#                       (defaults to the file-level minNumOfGenesExpressed).
#   minClusterPercent - minimal expected cluster size, in percent of the cells
#                       (defaults to the file-level minimalClusterSize).
#   logOffset         - constant subtracted after exponentiation
#                       (defaults to the file-level logScalingConstant).
#
# Returns: the filtered table. With convertToCPM = TRUE the result is a matrix,
# since t() converts data frames to matrices; otherwise the input class is kept.
filterData <- function(dataMatrix, isLogTPM, convertToCPM,
                       minGenesPerCell = minNumOfGenesExpressed,
                       minClusterPercent = minimalClusterSize,
                       logOffset = logScalingConstant)
{
  filteredDataMatrix <- dataMatrix
  
  if (isLogTPM == TRUE)
  {
    filteredDataMatrix <- 2^(filteredDataMatrix) - logOffset
  }
  
  # Filtering out cells which express less than the minimal number of genes.
  # drop = FALSE keeps the two-dimensional shape even if a single cell survives.
  expressedGenesCounters <- colSums(filteredDataMatrix != 0)
  cellsToKeep <- expressedGenesCounters >= minGenesPerCell
  filteredDataMatrix <- filteredDataMatrix[, cellsToKeep, drop = FALSE]
  
  # Filtering out genes which are not expressed by more than the minimal
  # expected cluster size of cells (computed on the cells that were kept).
  nonZeroCellCountsForGenes <- rowSums(filteredDataMatrix != 0)
  minNumOfCellsInClust <- ncol(filteredDataMatrix) * (minClusterPercent / 100)
  genesWithMinExpression <- (nonZeroCellCountsForGenes > minNumOfCellsInClust)
  filteredDataMatrix <- filteredDataMatrix[genesWithMinExpression, , drop = FALSE]
  
  # Converting the transcript counts to CPM
  if (convertToCPM == TRUE)
  {
    countSumsOfCells <- colSums(filteredDataMatrix)
    # After transposing, cells are rows, so the per-cell sums are recycled
    # down the rows and every cell is divided by its own total.
    filteredDataMatrix <- t((t(filteredDataMatrix) / countSumsOfCells) * 1000000)
  }
  
  return (filteredDataMatrix)
}

# Scores one pathway for all cells of one cell type, compares the COVID and
# control cells and saves a violin plot of the two score distributions.
#
# Parameters:
#   inputPathway     - gene set object; its geneIds and setName slots are read.
#   dataMatrix       - genes x cells table passed on to getPathwayScores().
#   controlCellNames - names of the control cells.
#   covidCellNames   - names of the COVID cells. Currently unused: every scored
#                      cell that is not listed in controlCellNames is labelled COVID.
#   cellType         - label of the cell type, used in the name of the PDF file.
# Note: Parameters controlCellNames and covidCellNames also include names of cells
# which are not in the dataMatrix.
#
# Side effects (the results are accumulated in global variables which the
# caller is expected to reset before processing a new cell type):
#   all_pathway_scores - gets a new row, named after the pathway, holding the
#                        scores of all cells of dataMatrix.
#   Original.P.Values  - gets the unadjusted t.test() P value of the pathway.
#   allEffectSizes     - gets the difference between the median COVID score and
#                        the median control score.
#   A file "NormalizedCounts_<pathway>_<cellType>_Cells.pdf" is written to the
#   working directory.
#
# Returns: the value of dev.off() after the plot was written. If the pathway
# could not be scored a warning is raised, nothing is recorded and NULL is
# returned invisibly.
executePathwayCalculations <- function(inputPathway, dataMatrix, controlCellNames, covidCellNames, cellType)
{
  pathwayGenes <- inputPathway@geneIds
  pathwayName <- inputPathway@setName
  
  # A pathway that cannot be scored is skipped with a warning, so that the
  # remaining pathways are still processed.
  pathwayScores <- tryCatch(getPathwayScores(dataMatrix, pathwayGenes),
                            error = function(e)
                            {
                              warning("Skipping pathway ", pathwayName, " (", cellType, "): ",
                                      conditionMessage(e), call. = FALSE)
                              NULL
                            })
  if (is.null(pathwayScores))
  {
    return (invisible(NULL))
  }
  
  # NOTE(review): `$` permits partial name matching on lists; the element name
  # returned by getPathwayScores() is not visible here, so confirm it before
  # replacing `$` with `[[`.
  # Presumably the scores are named by cell, making the cell names the row
  # names of the data frame - verify against getPathwayScores().
  scoresAsDataFrame <- data.frame(Score = pathwayScores$pathwayScore)
  scoresAsDataFrame$cellName <- rownames(scoresAsDataFrame)
  
  # Keeping all cell scores, in the column order of dataMatrix
  
  allCellNames <- colnames(dataMatrix)
  currPathwayScores <- scoresAsDataFrame[allCellNames, "Score"]
  names(currPathwayScores) <- allCellNames
  
  all_pathway_scores <<- rbind(all_pathway_scores, currPathwayScores)
  rownames(all_pathway_scores)[nrow(all_pathway_scores)] <<- pathwayName
  
  # Annotating the cells as COVID / control
  
  scoresAsDataFrame$cellType <- "COVID"
  controlCellIndices <- scoresAsDataFrame$cellName %in% controlCellNames
  scoresAsDataFrame$cellType[controlCellIndices] <- "CONTROL"
  covidCellIndices <- (scoresAsDataFrame$cellType == "COVID")
  
  # Performing a T test comparison between the COVID and control cells
  
  covidCellScores <- scoresAsDataFrame[covidCellIndices, "Score"]
  controlCellScores <- scoresAsDataFrame[controlCellIndices, "Score"]
  T.TestResult <- t.test(covidCellScores, controlCellScores)
  currPValue <- T.TestResult$p.value
  
  Original.P.Values[pathwayName] <<- currPValue
  
  # Calculating the effect size and generating graphics
  effectSize <- median(covidCellScores) - median(controlCellScores)
  allEffectSizes[pathwayName] <<- effectSize
  
  # Inside aes() cellType refers to the column of scoresAsDataFrame, not to
  # the function parameter of the same name.
  violinPlot <- ggplot(scoresAsDataFrame, aes(x = cellType, y = Score, fill = cellType)) +
    ggtitle(paste0("UNADJUSTED (!) P value:\t", "P < ", currPValue, "\n",
                   "Effect size is: ", effectSize)) +
    geom_violin(trim=FALSE) + geom_boxplot(width=0.1)
  
  pdf(paste0("NormalizedCounts_", pathwayName, "_", cellType, "_Cells", ".pdf"))
  # The device is closed even if drawing fails, so that it does not stay open
  # and capture later plots; the original error is then raised again.
  tryCatch(print(violinPlot),
           error = function(e)
           {
             dev.off()
             stop(e)
           })
  dev.off()
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MAIN program starts here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Directory holding the input files (gene sets, meta data and raw counts).
# Leave it empty to work in the current working directory: setwd("") is an
# error in R and would stop the script right here.
DATA_DIRECTORY <- ""
if (nzchar(DATA_DIRECTORY))
{
  setwd(DATA_DIRECTORY)
}

GMT_FILE_NAME <- "h.all.v7.0.symbols.pluscc.gmt"
genesets_name <- "hallmarks"
genesets <- getGmt(GMT_FILE_NAME)
metaData <- read.delim("lung_metaData.txt", header = TRUE, sep = "\t")


# Fetching the names of the cells to be used for the comparison.
# grepl() and %in% both yield FALSE for missing labels, so cells without a
# cell type annotation are never selected as all-NA rows.

cellTypeLabels <- metaData[, "cell_type_fine"]

# Every cell whose fine cell type label starts with "AT" (case sensitive)
airway_alveolar_epithelium <- metaData[grepl("^(AT)", cellTypeLabels, ignore.case = FALSE), ]
Activated_B_cells <- metaData[cellTypeLabels %in% "Activated B cells", ]
CD8_T_Cells <- metaData[cellTypeLabels %in% "CD8+ T cells", ]

relevantMetaData <- rbind(airway_alveolar_epithelium, Activated_B_cells, CD8_T_Cells)

# Reads the raw count files of a group of patients and combines them into a
# single genes x cells data frame which only holds the requested cells.
#
# Parameters:
#   countFileNames  - names of the comma separated raw count files; the gene
#                     names are expected in the first column, which fread()
#                     names "V1".
#   cellNamesToKeep - names of the cells to keep; all other columns are dropped.
#
# Returns: a data frame with the gene names as row names and one column per
# kept cell. Dots in the cell names are replaced by dashes before matching.
loadRawCounts <- function(countFileNames, cellNamesToKeep)
{
  # data.table = FALSE makes fread() return plain data frames directly
  patientTables <- lapply(countFileNames, fread, sep = ",", header = TRUE, data.table = FALSE)
  
  # cbind() pairs the rows by position only, so the files must list the same
  # genes in the same order; otherwise counts would silently be assigned to
  # the wrong genes.
  geneNames <- patientTables[[1]][, "V1"]
  for (patientTable in patientTables)
  {
    if (!identical(patientTable[, "V1"], geneNames))
    {
      stop("The raw count files do not list the same genes in the same order", call. = FALSE)
    }
  }
  
  allCells <- do.call(cbind, patientTables)
  
  # Releasing occupied memory
  rm(patientTables)
  
  # The first column holds the gene names. The gene name columns of the other
  # files are removed by the cell name filter below.
  rownames(allCells) <- geneNames
  allCells <- allCells[, -1, drop = FALSE]
  colnames(allCells) <- gsub(".", "-", colnames(allCells), fixed = TRUE)
  
  relevantCells <- colnames(allCells) %in% cellNamesToKeep
  allCells[, relevantCells, drop = FALSE]
}

# Reading raw counts data of the control patients, binding all control patient
# cells together and excluding those of irrelevant cell types

controlCountFiles <- c("GSM5226574_C51ctr_raw_counts.csv",
                       "GSM5226575_C52ctr_raw_counts.csv",
                       "GSM5226576_C53ctr_raw_counts.csv",
                       "GSM5226577_C54ctr_raw_counts.csv",
                       "GSM5226578_C55ctr_raw_counts.csv",
                       "GSM5226579_C56ctr_raw_counts.csv",
                       "GSM5226580_C57ctr_raw_counts.csv")

controlCells <- loadRawCounts(controlCountFiles, relevantMetaData[, "NAME"])
controlCellNames <- colnames(controlCells)
gc()


# Reading raw counts data of the COVID patients and only keeping cells of the
# relevant cell types.
# Note: There are no COVID patients 2, 14 and 20 in the data files provided!
covidCountFiles <- c("GSM5226581_L01cov_raw_counts.csv",
                     "GSM5226582_L03cov_raw_counts.csv",
                     "GSM5226583_L04cov_raw_counts.csv",
                     "GSM5226585_L05cov_raw_counts.csv",
                     "GSM5226586_L06cov_raw_counts.csv",
                     "GSM5226587_L07cov_raw_counts.csv",
                     "GSM5226588_L08cov_raw_counts.csv",
                     "GSM5226589_L09cov_raw_counts.csv",
                     "GSM5226590_L10cov_raw_counts.csv",
                     "GSM5226591_L11cov_raw_counts.csv",
                     "GSM5226592_L12cov_raw_counts.csv",
                     "GSM5226593_L13cov_raw_counts.csv",
                     "GSM5226594_L15cov_raw_counts.csv",
                     "GSM5226595_L16cov_raw_counts.csv",
                     "GSM5226596_L17cov_raw_counts.csv",
                     "GSM5226597_L18cov_raw_counts.csv",
                     "GSM5226598_L19cov_raw_counts.csv",
                     "GSM5226599_L21cov_raw_counts.csv",
                     "GSM5226600_L22cov_raw_counts.csv")

covidCells <- loadRawCounts(covidCountFiles, relevantMetaData[, "NAME"])
covidCellNames <- colnames(covidCells)
gc()

# Combining the control and COVID cells into one genes x cells table.
# cbind() pairs the rows by position only, so both tables must list the same
# genes in the same order; otherwise counts would silently be assigned to the
# wrong genes.
if (!identical(rownames(controlCells), rownames(covidCells)))
{
  stop("Control and COVID count tables do not list the same genes in the same order", call. = FALSE)
}
allRelevantCells <- cbind(controlCells, covidCells)

# Splitting the data into the different cell types for score calculations
epitheliumCellType <- colnames(allRelevantCells) %in% airway_alveolar_epithelium[, "NAME"]
BCellType <- colnames(allRelevantCells) %in% Activated_B_cells[, "NAME"]
TCellType <- colnames(allRelevantCells) %in% CD8_T_Cells[, "NAME"]

# Filtering the data for every cell type separately and converting it to CPM.
# drop = FALSE keeps a table even if a cell type has a single cell only.
allEpithelialCells <- allRelevantCells[, epitheliumCellType, drop = FALSE]
allEpithelialCells <- filterData(dataMatrix = allEpithelialCells, 
                                 isLogTPM = FALSE, convertToCPM = TRUE)

allBCells <- allRelevantCells[, BCellType, drop = FALSE]
allBCells <- filterData(dataMatrix = allBCells, isLogTPM = FALSE, convertToCPM = TRUE)

allTCells <- allRelevantCells[, TCellType, drop = FALSE]
allTCells <- filterData(dataMatrix = allTCells, isLogTPM = FALSE, convertToCPM = TRUE)

# Comparing the pathway scores of the COVID and control cells, separately for
# each cell type. Every entry describes one analysis: the filtered data, the
# label used in the names of the plot files and the names of the result files.
cellTypeAnalyses <- list(
  list(dataMatrix = allEpithelialCells,
       cellType = "Epithelium",
       scoresFile = "Epithelial_Cells_All_Pathway_Scores.RDS",
       fdrFile = "FDR_Values_EpithelialCells.csv",
       effectSizesFile = "EffectSizes_EpithelialCells.csv"),
  list(dataMatrix = allBCells,
       cellType = "BCells",
       scoresFile = "B_Cells_All_Pathway_Scores.RDS",
       fdrFile = "FDR_Values_BCells.csv",
       effectSizesFile = "EffectSizes_BCells.csv"),
  list(dataMatrix = allTCells,
       cellType = "TCells",
       scoresFile = "T_Cells_All_Pathway_Scores.RDS",
       fdrFile = "FDR_Values_TCells.csv",
       effectSizesFile = "EffectSizes_TCells.csv"))

for (analysis in cellTypeAnalyses)
{
  # executePathwayCalculations() accumulates its results in these global
  # variables, so they are reset before every cell type.
  all_pathway_scores <- numeric()
  Original.P.Values <- numeric()
  allEffectSizes <- numeric()
  
  # The function is called for its side effects only, hence a plain loop
  # instead of lapply(), whose result list would be printed to the console.
  for (pathway in genesets@.Data)
  {
    executePathwayCalculations(pathway, analysis$dataMatrix, controlCellNames, covidCellNames,
                               cellType = analysis$cellType)
  }
  
  saveRDS(all_pathway_scores, file = analysis$scoresFile)
  
  # Correcting the P values of all pathways for multiple testing (Benjamini-Hochberg)
  Adjusted.P.Values <- p.adjust(Original.P.Values, method = "BH", n = length(Original.P.Values))
  write.csv2(Adjusted.P.Values, file = analysis$fdrFile)
  write.csv2(allEffectSizes, file = analysis$effectSizesFile)
}
